Getting the data

# Load the raw inputs only once per session. The presence of `customers_org`
# is used as the marker that the data has already been read, so re-running
# the notebook skips the read.
# NOTE(review): presumably readData() creates `customers_org` (and the other
# raw tables used further down) as a side effect -- confirm in its definition.
if (!exists('customers_org')) {
  readData(customers_file, subscriptions_file, gdp_file)
}

Optionally sample the data for faster development

# Build the customer table. processCustomers() receives the sampling switch
# and the sampling fraction, so development runs can work on a subset.
customers <- customers_org %>%
  processCustomers(SAMPLE_DATA, fraction_sample)

# Build the subscription table used for modelling: keep only subscriptions of
# retained customers, then run them through the project helpers (customer
# join, GDP join, category concatenation).
subscriptions <- subscriptions_org %>%
  # Removing customers not in the customers set
  filter(customerid %in% customers$customerid) %>%
  processSubscriptions() %>%
  joinProcessCustomers(customers, age_to_join_sitevers = 5) %>%
  joinProcessGDP(gdp) %>%
  concatCategrories() %>%
  # Convert every character column to a factor. The function is passed
  # directly: wrapping it in funs() is deprecated since dplyr 0.8.0 and the
  # result is the same.
  mutate_if(is.character, as.factor)

# Quick sanity check of column types, ranges and NA counts.
summary(subscriptions)
   customerid       subscriptionid      revenuecurr        billingcurrency    startmonth            endmonth              months          status        num_previous_months
 Min.   :   10006   Min.   :     154   Min.   :      0.8   DKK    :668540   Min.   :2003-12-01   Min.   :2004-03-01   Min.   : 1.000   active:1840623   Min.   :  0.00     
 1st Qu.: 1090111   1st Qu.: 5121661   1st Qu.:     15.0   EUR    :532574   1st Qu.:2012-12-01   1st Qu.:2013-04-01   1st Qu.: 1.000   churn : 442541   1st Qu.:  1.00     
 Median : 5319576   Median :14784414   Median :     63.2   USD    :505039   Median :2014-12-01   Median :2015-03-01   Median : 3.000                    Median :  8.00     
 Mean   : 8044651   Mean   :14612438   Mean   :   1382.8   NOK    :232550   Mean   :2014-05-02   Mean   :2014-08-18   Mean   : 3.532                    Mean   : 15.34     
 3rd Qu.:14410806   3rd Qu.:24044332   3rd Qu.:    174.0   SEK    :159059   3rd Qu.:2016-07-01   3rd Qu.:2016-11-01   3rd Qu.: 3.000                    3rd Qu.: 22.00     
 Max.   :23917123   Max.   :29424356   Max.   :2823000.0   GBP    : 96217   Max.   :2018-03-01   Max.   :2020-03-01   Max.   :24.000                    Max.   :162.00     
                                                           (Other): 89185                                                                                                  
 num_previous_subs num_previous_months_binned firstpaiddate        channelcat      paymentperiodchosenatstart    currency        marketname       siteverkey     firstpaidmonth      
 Min.   : 0.000    Min.   : 0.00              Min.   :2003-12-15   paid :1518216   Min.   :-1.000             DKK    :668679   DK     :669017   US     :898770   Min.   :2003-12-01  
 1st Qu.: 1.000    1st Qu.: 1.00              1st Qu.:2010-12-12   viral: 764948   1st Qu.: 1.000             USD    :543314   NO     :232881   DK     :637715   1st Qu.:2010-12-01  
 Median : 3.000    Median : 8.00              Median :2013-06-23                   Median : 3.000             EUR    :537318   US     :188723   NO     :207482   Median :2013-06-01  
 Mean   : 6.028    Mean   :14.72              Mean   :2013-01-16                   Mean   : 4.237             NOK    :232433   SE     :159905   SE     :144173   Mean   :2013-01-02  
 3rd Qu.: 8.000    3rd Qu.:26.00              3rd Qu.:2015-08-19                   3rd Qu.: 3.000             SEK    :158858   FR     :146174   FR     : 96298   3rd Qu.:2015-08-01  
 Max.   :68.000    Max.   :39.00              Max.   :2018-03-26                   Max.   :24.000             GBP    : 93847   (Other):886057   NL     : 79701   Max.   :2018-03-01  
                                                                                                              (Other): 48715   NA's   :   407   (Other):219025                       
  firstdevice          segment        isquickpurchase  productversion      isfreemium                                model31224        market_category   siteverkey_cat siteverkey_cat2
 desktop: 175519   business: 407581   Min.   :0.0000   v_3    : 898821   Min.   :0.0000   pre-changes                     :1894566   DK        :669017   ORG:1384394    MUT:1311010    
 mobile :  70157   other   :  97467   1st Qu.:0.0000   v_4    : 138959   1st Qu.:0.0000   3-12m-v1-2015-11-30             : 145756   LowGeneric:265151   SS : 898770    ORG: 402042    
 NA's   :2037488   personal: 293518   Median :1.0000   v_older:1245078   Median :0.0000   3-12-24m-2016-08-25             : 144551   NO        :232881                  SS : 570112    
                   NA's    :1484598   Mean   :0.5282   NA's   :    306   Mean   :0.2547   12m-v2-2015-05-16               :  71866   US        :188723                                 
                                      3rd Qu.:1.0000                     3rd Qu.:1.0000   ex-subscriptionplanpaywallexpand:   8305   SE        :159905                                 
                                      Max.   :1.0000                     Max.   :1.0000   ex-uglyjerry12mswitch           :   5388   FR        :146174                                 
                                      NA's   :306                        NA's   :306      (Other)                         :  12732   (Other)   :621313                                 
 chosen_subs_length  gdppercapita      gdppercapita_scaled                           subscription_summary            subscription_summary_no_market
 1  :  56935        Min.   :   218.3   Min.   :-2.4437     mc-DK_ssc-MUT_ac-39_m-3_ccsl-gen    : 109608   ssc-MUT_ac-26_m-3_ccsl-gen: 203834       
 -1 :  11140        1st Qu.: 42013.3   1st Qu.:-0.4694     mc-DK_ssc-MUT_ac-26_m-3_ccsl-gen    :  77645   ssc-MUT_ac-39_m-3_ccsl-gen: 170628       
 12 :  51360        Median : 55670.9   Median : 0.1758     mc-LowGeneric_ssc-SS_ac-0_m-1_ccsl-3:  60652   ssc-SS_ac-0_m-1_ccsl-3    : 161585       
 24 :   6897        Mean   : 51950.1   Mean   : 0.0000     mc-DK_ssc-ORG_ac-0_m-3_ccsl-gen     :  58390   ssc-ORG_ac-0_m-3_ccsl-gen : 132393       
 3  : 204319        3rd Qu.: 60637.3   3rd Qu.: 0.4104     mc-DK_ssc-MUT_ac-38_m-3_ccsl-gen    :  51997   ssc-MUT_ac-8_m-3_ccsl-gen : 120482       
 gen:1952513        Max.   :108422.5   Max.   : 2.6676     mc-DK_ssc-ORG_ac-3_m-3_ccsl-gen     :  41933   ssc-MUT_ac-38_m-3_ccsl-gen: 111133       
                                                           (Other)                             :1882939   (Other)                   :1383109       
# Modelling frame: subscriptions ending inside
# [begin_train_window, end_window), labelled with the train/validation split
# and a numeric churn indicator.
subscriptions_with_target <- subscriptions %>%
  # restrict to a recent expiry window
  filter(endmonth >= begin_train_window, endmonth < end_window) %>%
  mutate(
    # factor copy of the binned tenure, alongside the numeric original
    num_previous_months_binned_fct = as.factor(num_previous_months_binned),
    # subscriptions ending on/after begin_validation_window are held out
    set_type = as.factor(
      if_else(endmonth >= begin_validation_window, 'validation', 'training')
    ),
    # target: 1 when the subscription churned, 0 otherwise
    churnind = ifelse(status == 'churn', 1, 0)
  )

Prepare the churn table that we want to predict: observed churn rates aggregated per segment.

# Aggregate subscriptions into segments (one row per combination of the
# grouping columns) and derive the per-segment churn statistics.
churntable <- subscriptions_with_target %>%
  group_by(set_type, siteverkey_cat2, market_category, months,
           num_previous_months_binned, chosen_subs_length,
           subscription_summary_no_market) %>%
  summarise(num_obs = n(),
            churned = sum(churnind)) %>%
  # Re-group by set only, so that `weight` below is each segment's share of
  # the observations within its own set (training / validation).
  group_by(set_type) %>%
  mutate(churn_rate = churned / num_obs,
         renew_rate = 1 - churn_rate,
         # convert the per-period churn rate to an equivalent monthly rate
         month_churn = 1 - renew_rate ^ (1/as.double(months)),
         # -Inf for segments with zero churn (log(0)); those rows are
         # removed below
         log_month_churn = log(month_churn),
         weight = num_obs / sum(num_obs)) %>%
  # Drop the grouping so it does not leak into later operations on
  # `churntable`.
  ungroup()

# Segments without any churn have log_month_churn = -Inf and cannot be used
# as a regression target, so they are dropped. Note that `weight` was computed
# before this filter and is not re-normalised.
# NB! Does this introduce a bad bias ????
churntable_no_zeros <- churntable %>%
  filter(churn_rate > 0)

Train model

# Weighted linear model (glm with its default gaussian family) of the log
# monthly churn rate, fitted on the training segments only.
new_model <- glm(
  log_month_churn ~ market_category + subscription_summary_no_market,
  data = churntable_no_zeros[churntable_no_zeros$set_type == 'training', ],
  weights = weight
)

Model validation for training (2017-01-01 - 2017-08-01) and validation (2017-09-01 - 2018-01-01) sets:

# Evaluate `new_model` on the subscription-level data with the project helper
# validation(); `predict_2fct_model` is passed as the third argument.
# NOTE(review): presumably that argument is the prediction function mapping
# the model's log-monthly-churn output back to a churn probability -- confirm
# in utils_validation.R.
prediction_table <- validation(subscriptions_with_target, new_model, predict_2fct_model)

# Draw the validation plots from the prediction table.
validation_plots(prediction_table)
NAs introduced by coercion

Try a simple logistic model

# Baseline: logistic regression on the individual subscriptions (not on the
# aggregated churn table), fitted on the training rows only.
model_logit <- glm(
  churnind ~ market_category +
    siteverkey_cat2 +
    num_previous_months_binned +
    months +
    chosen_subs_length,
  data = subscriptions_with_target[subscriptions_with_target$set_type == 'training', ],
  family = 'binomial'
)

Model validation for training (2017-01-01 - 2017-08-01) and validation (2017-09-01 - 2018-01-01) sets:

# Evaluate the logistic baseline with the project helper validation(). No
# third argument is passed here, unlike the call for `new_model`.
# NOTE(review): presumably validation() then falls back to a default
# prediction function -- confirm in utils_validation.R.
prediction_table_logit <- validation(subscriptions_with_target, model_logit)

# Draw the validation plots from the prediction table.
validation_plots(prediction_table_logit)
NAs introduced by coercion

LS0tCnRpdGxlOiAiQ2h1cm46IFR3byBkaW1lbnNpb25zIGFuZCBwcmVkaWN0IGxvZyBvZiBtb250aGx5IGNodXJuIHByb2JhYmlsaXR5IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0Kc291cmNlKCdjb25maWcuUicpCnNvdXJjZSgndXRpbHMuUicpCnNvdXJjZSgndXRpbHNfdmFsaWRhdGlvbi5SJykKCiMgc2FtcGxpbmcgZm9yIGRldgojIFNBTVBMRV9EQVRBIDwtIFRSVUUKU0FNUExFX0RBVEEgPC0gRkFMU0UKZnJhY3Rpb25fc2FtcGxlIDwtIDAuMDEgIyB1c2UgdGhpcyB0byBzbGltIGRhdGEgZm9yIGZhc3RlciBleHBlcmltZW50YXRpb24KCiMgc291cmNlKCduZXdfbGlmZXRpbWVwcmVkaWN0b3IuUicpCmBgYAoKR2V0dGluZyB0aGUgZGF0YQoKYGBge3IgcmVhZGluZywgdGlkeT1GfQojIGRvbid0IHJlYWQgYWdhaW4sIHdlJ3ZlIGFscmVhZHkgZ290IGl0CmlmICghZXhpc3RzKCdjdXN0b21lcnNfb3JnJykpewogIHJlYWREYXRhKGN1c3RvbWVyc19maWxlLCBzdWJzY3JpcHRpb25zX2ZpbGUsIGdkcF9maWxlKQp9CmBgYAoKT3B0aW9uYWxseSBzYW1wbGUgdGhlIGRhdGEgZm9yIGZhc3RlciBkZXZlbG9wbWVudAoKYGBge3IgY3VzdG9tZXJzX3Byb2Nlc3NpbmcsIHRpZHk9Rn0KY3VzdG9tZXJzIDwtIGN1c3RvbWVyc19vcmcgJT4lCiAgcHJvY2Vzc0N1c3RvbWVycyhTQU1QTEVfREFUQSwgZnJhY3Rpb25fc2FtcGxlKQpgYGAKCmBgYHtyIHN1YnNjcmlwdGlvbnNfcHJvY2Vzc2luZywgdGlkeT1GfQoKc3Vic2NyaXB0aW9ucyA8LSBzdWJzY3JpcHRpb25zX29yZyAlPiUKICAjIFJlbW92aW5nIGN1c3RvbWVycyBub3QgaW4gdGhlIGN1c3RvbWVycyBzZXQKICBmaWx0ZXIoY3VzdG9tZXJpZCAlaW4lIGN1c3RvbWVycyRjdXN0b21lcmlkKSAlPiUKCiAgcHJvY2Vzc1N1YnNjcmlwdGlvbnMoKSAlPiUKCiAgam9pblByb2Nlc3NDdXN0b21lcnMoY3VzdG9tZXJzLCBhZ2VfdG9fam9pbl9zaXRldmVycyA9IDUpICU+JQoKICBqb2luUHJvY2Vzc0dEUChnZHApICU+JQoKICBjb25jYXRDYXRlZ3JvcmllcygpICU+JQoKICBtdXRhdGVfaWYoaXMuY2hhcmFjdGVyLCBmdW5zKGFzLmZhY3RvcikpCiAgCmBgYAoKYGBge3J9CnN1bW1hcnkoc3Vic2NyaXB0aW9ucykKYGBgCgpgYGB7cn0Kc3Vic2NyaXB0aW9uc193aXRoX3RhcmdldCA8LSBzdWJzY3JpcHRpb25zICU+JQogICMgcmVzdHJpY3QgdG8gYSByZWNlbnQgZXhwaXJ5IHdpbmRvdwogIGZpbHRlcihlbmRtb250aCA+PSBiZWdpbl90cmFpbl93aW5kb3cgJiBlbmRtb250aCA8IGVuZF93aW5kb3cpICU+JQogIG11dGF0ZShudW1fcHJldmlvdXNfbW9udGhzX2Jpbm5lZF9mY3QgPSBhcy5mYWN0b3IobnVtX3ByZXZpb3VzX21vbnRoc19iaW5uZWQpKSAlPiUKICBtdXRhdGUoc2V0X3R5cGUgPSBhcy5mYWN0b3IoaWZfZWxzZShlbmRtb250aCA+PSBiZWdpbl92YWxpZGF0aW9uX3dpbmRvdywgJ3ZhbGlkYXRpb24nLCAndHJhaW5pbmcnKSkpICU+JQog
IAogIG11dGF0ZShjaHVybmluZCA9IGlmZWxzZShzdGF0dXMgPT0gJ2NodXJuJywgMSwgMCkpIApgYGAKCgpQcmVwYXJlIGNodXJudGFibGUgdGhhdCB3ZSB3YW50IHRvIHByZWRpY3QuCgpgYGB7cn0KY2h1cm50YWJsZSA8LSBzdWJzY3JpcHRpb25zX3dpdGhfdGFyZ2V0ICU+JQogIAogIGdyb3VwX2J5KHNldF90eXBlLCBzaXRldmVya2V5X2NhdDIsIG1hcmtldF9jYXRlZ29yeSwgbW9udGhzLCBudW1fcHJldmlvdXNfbW9udGhzX2Jpbm5lZCwgY2hvc2VuX3N1YnNfbGVuZ3RoLCBzdWJzY3JpcHRpb25fc3VtbWFyeV9ub19tYXJrZXQpICU+JQogIHN1bW1hcmlzZShudW1fb2JzID0gbigpLCAKICAgICAgICAgICAgY2h1cm5lZCA9IHN1bShjaHVybmluZCkpICU+JQogIAogIGdyb3VwX2J5KHNldF90eXBlKSAlPiUKICBtdXRhdGUoY2h1cm5fcmF0ZSA9IGNodXJuZWQgLyBudW1fb2JzLAogICAgICAgICByZW5ld19yYXRlID0gMSAtIGNodXJuX3JhdGUsCiAgICAgICAgIG1vbnRoX2NodXJuID0gMSAtIHJlbmV3X3JhdGUgXiAoMS9hcy5kb3VibGUobW9udGhzKSksCiAgICAgICAgIGxvZ19tb250aF9jaHVybiA9IGxvZyhtb250aF9jaHVybiksCiAgICAgICAgIHdlaWdodCA9IG51bV9vYnMgLyBzdW0obnVtX29icykpCgojIE5CISBEb2VzIHRoaXMgaW50cm9kdWNlIGEgYmFkIGJpYXMgPz8/PwpjaHVybnRhYmxlX25vX3plcm9zIDwtIGNodXJudGFibGUgJT4lCiAgZmlsdGVyKGNodXJuX3JhdGUgPiAwKQpgYGAKClRyYWluIG1vZGVsCgpgYGB7cn0KbmV3X21vZGVsPWdsbShsb2dfbW9udGhfY2h1cm4gfiBtYXJrZXRfY2F0ZWdvcnkgKyBzdWJzY3JpcHRpb25fc3VtbWFyeV9ub19tYXJrZXQsIGRhdGE9Y2h1cm50YWJsZV9ub196ZXJvc1tjaHVybnRhYmxlX25vX3plcm9zJHNldF90eXBlID09ICd0cmFpbmluZycsIF0sIHdlaWdodHMgPSB3ZWlnaHQpCmBgYAoKTW9kZWwgdmFsaWRhdGlvbiBmb3IgdHJhaW5pbmcgKDIwMTctMDEtMDEgLSAyMDE3LTA4LTAxKSBhbmQgdmFsaWRhdGlvbiAoMjAxNy0wOS0wMSAtIDIwMTgtMDEtMDEpIHNldHM6CgoqIFN1bW1hcnkgdGFibGUgY29udGFpbmluZwogICAgKyBOdW1iZXIgb2Ygb2JzZXJ2YXRpb25zIHdpdGhvdXQgcHJlZGljdGlvbgogICAgKyBBVUMsIGxvZ2xvc3MgLSBwcmVkaWN0aW9uIHF1YWxpdHkgbWV0cmljcwoqIFJPQyBjdXJ2ZQoqIFBsb3RzIHBlciBtYXJrZXQKICAgICsgQWdlIG9mIGN1c3RvbWVycyB2cyByZWFsIGFuZCBwcmVkaWN0ZWQgcHJvYmFiaWxpdHkgb2YgY2h1cm4gZm9yIGRpZmZlcmVudCBzdWJzY3JpcHRpb24gbGVuZ3RocwogICAgKyBDYWxpYnJhdGlvbiAtIFByZWRpY3RlZCBwcm9iYWJpbGl0eSBvZiBjaHVybiB2cyByZWFsIHByb2JhYmlsaXR5IG9mIGNodXJuIGZvciBkaWZmZXJlbnQgc3Vic2NyaXB0aW9uIGxlbmd0aHMgKHdlbGwgY2FsaWJyYXRlZCBwcmVkaWN0aW9uIHNob3VsZCBmb3JtIGEgZGlhZ29uYWwgbGluZSkKCmBgYHtyIHdhcm5pbmc9Rn0KcHJlZGljdGlv
bl90YWJsZSA8LSB2YWxpZGF0aW9uKHN1YnNjcmlwdGlvbnNfd2l0aF90YXJnZXQsIG5ld19tb2RlbCwgcHJlZGljdF8yZmN0X21vZGVsKQpgYGAKCmBgYHtyIGZpZy5oZWlnaHQ9MTAsIGZpZy53aWR0aD03LCB3YXJuaW5nPUZ9CnZhbGlkYXRpb25fcGxvdHMocHJlZGljdGlvbl90YWJsZSkKYGBgCgpUcnkgc2ltcGxlIGxvZ2lzdGljIG1vZGVsCmBgYHtyfQptb2RlbF9sb2dpdCA8LSBnbG0oY2h1cm5pbmQgfiBtYXJrZXRfY2F0ZWdvcnkgKyBzaXRldmVya2V5X2NhdDIgKyBudW1fcHJldmlvdXNfbW9udGhzX2Jpbm5lZCArIG1vbnRocyArIGNob3Nlbl9zdWJzX2xlbmd0aCwKICAgICAgICAgICAgICAgICAgIGRhdGEgPSBzdWJzY3JpcHRpb25zX3dpdGhfdGFyZ2V0W3N1YnNjcmlwdGlvbnNfd2l0aF90YXJnZXQkc2V0X3R5cGUgPT0gJ3RyYWluaW5nJyxdLCBmYW1pbHkgPSAnYmlub21pYWwnKQpgYGAKCgpNb2RlbCB2YWxpZGF0aW9uIGZvciB0cmFpbmluZyAoMjAxNy0wMS0wMSAtIDIwMTctMDgtMDEpIGFuZCB2YWxpZGF0aW9uICgyMDE3LTA5LTAxIC0gMjAxOC0wMS0wMSkgc2V0czoKCiogU3VtbWFyeSB0YWJsZSBjb250YWluaW5nCiAgICArIE51bWJlciBvZiBvYnNlcnZhdGlvbnMgd2l0aG91dCBwcmVkaWN0aW9uCiAgICArIEFVQywgbG9nbG9zcyAtIHByZWRpY3Rpb24gcXVhbGl0eSBtZXRyaWNzCiogUk9DIGN1cnZlCiogUGxvdHMgcGVyIG1hcmtldAogICAgKyBBZ2Ugb2YgY3VzdG9tZXJzIHZzIHJlYWwgYW5kIHByZWRpY3RlZCBwcm9iYWJpbGl0eSBvZiBjaHVybiBmb3IgZGlmZmVyZW50IHN1YnNjcmlwdGlvbiBsZW5ndGhzCiAgICArIENhbGlicmF0aW9uIC0gUHJlZGljdGVkIHByb2JhYmlsaXR5IG9mIGNodXJuIHZzIHJlYWwgcHJvYmFiaWxpdHkgb2YgY2h1cm4gZm9yIGRpZmZlcmVudCBzdWJzY3JpcHRpb24gbGVuZ3RocyAod2VsbCBjYWxpYnJhdGVkIHByZWRpY3Rpb24gc2hvdWxkIGZvcm0gYSBkaWFnb25hbCBsaW5lKQoKYGBge3Igd2FybmluZz1GfQpwcmVkaWN0aW9uX3RhYmxlX2xvZ2l0IDwtIHZhbGlkYXRpb24oc3Vic2NyaXB0aW9uc193aXRoX3RhcmdldCwgbW9kZWxfbG9naXQpCmBgYAoKYGBge3IgZmlnLmhlaWdodD0xMCwgZmlnLndpZHRoPTcsIHdhcm5pbmc9Rn0KdmFsaWRhdGlvbl9wbG90cyhwcmVkaWN0aW9uX3RhYmxlX2xvZ2l0KQpgYGA=